import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, f1_score
import lightgbm as lgb
from collections import Counter
import warnings

def f1_score_eval(preds, valid_df):
    labels = valid_df.get_label()
    preds = np.argmax(preds.reshape(3, -1), axis=0)
    scores = f1_score(y_true=labels, y_pred=preds, average=None)
    scores = scores[0]*0.2+scores[1]*0.2+scores[2]*0.6
    return 'f1_score', scores, True


def lgb_train(train_: pd.DataFrame, test_: pd.DataFrame, use_train_feats: list, id_col: str, label: str,
              n_splits: int, split_rs: int, is_shuffle=True, use_cart=False, cate_cols=None) -> pd.DataFrame:
    if not cate_cols:
        cate_cols = []
    print('data shape:\ntrain--{}\ntest--{}'.format(train_.shape, test_.shape))
    print('Use {} features ...'.format(len(use_train_feats)))
    print('Use lightgbm to train ...')
    n_class = train_[label].nunique()
    train_[f'{label}_pred'] = 0
    test_pred = np.zeros((test_.shape[0], n_class))
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = use_train_feats

    folds = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=split_rs)
    train_user_id = train_[id_col].unique()

    params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'None',
        'num_leaves': 31,
        'num_class': n_class,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 1,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': -1,
        'verbose': -1
    }

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_user_id), start=1):
        print('the {} training start ...'.format(n_fold))
        train_x, train_y = train_.loc[train_[id_col].isin(train_user_id[train_idx]), use_train_feats], train_.loc[
            train_[id_col].isin(train_user_id[train_idx]), label]
        valid_x, valid_y = train_.loc[train_[id_col].isin(train_user_id[valid_idx]), use_train_feats], train_.loc[
            train_[id_col].isin(train_user_id[valid_idx]), label]
        print(f'for train user:{len(train_idx)}\nfor valid user:{len(valid_idx)}')

        if use_cart:
            dtrain = lgb.Dataset(train_x, label=train_y, categorical_feature=cate_cols)
            dvalid = lgb.Dataset(valid_x, label=valid_y, categorical_feature=cate_cols)
        else:
            dtrain = lgb.Dataset(train_x, label=train_y)
            dvalid = lgb.Dataset(valid_x, label=valid_y)

        clf = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=5000,
            valid_sets=[dvalid],
            early_stopping_rounds=100,
            verbose_eval=100,
            feval=f1_score_eval
        )
        fold_importance_df[f'fold_{n_fold}_imp'] = clf.feature_importance(importance_type='gain')
        train_.loc[train_[id_col].isin(train_user_id[valid_idx]), f'{label}_pred'] = np.argmax(
            clf.predict(valid_x, num_iteration=clf.best_iteration), axis=1)
        test_pred += clf.predict(test_[use_train_feats], num_iteration=clf.best_iteration) / folds.n_splits

    report = f1_score(train_[label], train_[f'{label}_pred'], average=None)
    print(classification_report(train_[label], train_[f'{label}_pred'], digits=4))
    print('Score: ', report[0] * 0.2 + report[1] * 0.2 + report[2] * 0.6)
    test_[f'{label}_pred'] = np.argmax(test_pred, axis=1)
    test_[label] = np.argmax(test_pred, axis=1)+1
    five_folds = [f'fold_{f}_imp' for f in range(1, n_splits + 1)]
    fold_importance_df['avg_imp'] = fold_importance_df[five_folds].mean(axis=1)
    fold_importance_df.sort_values(by='avg_imp', ascending=False, inplace=True)
    print(fold_importance_df[['Feature', 'avg_imp']].head(20))
    return test_[[id_col, 'current_slice_id', 'future_slice_id', label]]


if __name__ == "__main__":
    print("read attr....")
    attr = pd.read_csv('attr.txt', sep='\t',
                       names=['link', 'length', 'direction', 'path_class', 'speed_class', 'LaneNum', 'speed_limit',
                              'level', 'width'], header=None)
    print("read sum_data...")
    train = pd.read_csv('sum_data.txt')
    print("read is_test...")
    test = pd.read_csv('is_test.csv')
    print("lianjie...")
    train = train.merge(attr, on='link', how='left')
    test = test.merge(attr, on='link', how='left')
    print('del tezheng...')
    use_cols = [i for i in train.columns if
                i not in ['link', 'label', 'current_slice_id', 'future_slice_id', 'label_pred']]
    print("train.....")
    sub = lgb_train(train, test, use_cols, 'link', 'label', 5, 2020)
    print("to csv...")
    sub.to_csv('submit_example.csv', index=False, encoding='utf8')